From https://www.linkedin.com/learning/data-science-foundations-data-mining/anomaly-detection-in-r (Linkedin Premium)

Add IADB ML course

Set up

# # I am executing it because of the blog post
# knitr::opts_chunk$set(eval = TRUE, 
#                            echo = TRUE, 
#                            tidy = FALSE, 
#                            results='hide',  
#                            message = FALSE, 
#                            warning = FALSE , fig.show='asis', fig.align='center', 
#                            fig.width=6, fig.height=6)

 
# Install any missing packages, then load them.
# NOTE: the common `if (!require(pkg)) install.packages(pkg)` one-liner
# installs a missing package but never attaches it for the current
# session, so we re-load with library() after installing.
load_or_install <- function(pkg) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
invisible(lapply(
  c("PerformanceAnalytics",
    "ggcorrplot",
    "GGally",       # Extension to ggplot2
    "ggpubr",
    "kableExtra",
    "pander"),
  load_or_install
))

DIMENSIONALITY REDUCTION

DEFINITION: techniques that reduce the number of variables in a dataset while preserving as much of its structure (variance) as possible.

EXAMPLES: feature extraction before modeling, data compression, visualization of high-dimensional data.

ALGORITHMS: + Principal Component Analysis (PCA) + Linear Discriminant Analysis (LDA) + Generalized Discriminant Analysis (GDA)


PRINCIPAL COMPONENT ANALYSIS

From “psych” package documentation (p. 213) “The primary empirical difference between a components versus a factor model is the treatment of the variances for each item. Philosophically, components are weighted composites of observed variables while in the factor model, variables are weighted composites of the factors.”

# Conducting a principal components/factor analysis

# Load data (mtcars ships with base R)
data(mtcars)
mtcars[1:5, ]  # Preview the first five rows
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
mtcars1 <- mtcars[, c(1:4, 6:7, 9:11)]  # Select variables (drops drat and vs)
mtcars1[1:5, ]
##                    mpg cyl disp  hp    wt  qsec am gear carb
## Mazda RX4         21.0   6  160 110 2.620 16.46  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 2.875 17.02  1    4    4
## Datsun 710        22.8   4  108  93 2.320 18.61  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.215 19.44  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.440 17.02  0    3    2
# ========= Principal components model using default method
# If using entire data frame:
pc <- prcomp(mtcars1,
             center = TRUE,  # Centers means to 0 (optional)
             scale = TRUE)  # Sets unit variance (helpful when variables are on different scales)

# Or specify variables with a formula interface:
# pc <- prcomp(~ mpg + cyl + disp + hp + wt + qsec + am + 
#                gear + carb, data = mtcars, scale = TRUE)

# ?prcomp  # Generally preferred (SVD-based)
# ?princomp  # Very slightly different method, similar to S

# Get summary stats: PC1 + PC2 already explain ~87% of the variance
summary(pc)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.3391 1.5299 0.71836 0.46491 0.38903 0.35099
## Proportion of Variance 0.6079 0.2601 0.05734 0.02402 0.01682 0.01369
## Cumulative Proportion  0.6079 0.8680 0.92537 0.94939 0.96620 0.97989
##                            PC7     PC8    PC9
## Standard deviation     0.31714 0.24070 0.1499
## Proportion of Variance 0.01118 0.00644 0.0025
## Cumulative Proportion  0.99107 0.99750 1.0000
# Screeplot: variance explained by each component
plot(pc)

# Get standard deviations and how variables load on PCs (rotation matrix)
pc
## Standard deviations (1, .., p=9):
## [1] 2.3391410 1.5299383 0.7183646 0.4649052 0.3890348 0.3509911 0.3171373
## [8] 0.2406989 0.1498962
## 
## Rotation (n x k) = (9 x 9):
##             PC1         PC2         PC3        PC4         PC5
## mpg  -0.4023287  0.02205294 -0.17272803 -0.1366169  0.31654561
## cyl   0.4068870  0.03589482 -0.27747610  0.1410976  0.02066646
## disp  0.4046964 -0.06479590 -0.17669890 -0.5089434  0.21525777
## hp    0.3699702  0.26518848 -0.01046827 -0.1273173  0.42166543
## wt    0.3850686 -0.15955242  0.33740464 -0.4469327 -0.21141143
## qsec -0.2168575 -0.48343885  0.54815205 -0.2545226  0.05466817
## am   -0.2594512  0.46039449 -0.19492256 -0.5354196 -0.55331460
## gear -0.2195660  0.50608232  0.34579810 -0.1799814  0.50533262
## carb  0.2471604  0.44322600  0.53847588  0.3203064 -0.25696817
##               PC6        PC7        PC8         PC9
## mpg  -0.718609897  0.3633216 -0.1487806  0.13567069
## cyl  -0.214224005  0.2099893  0.7951724  0.11635839
## disp  0.010052074  0.2007152 -0.1346748 -0.66099594
## hp   -0.254229405 -0.6741641 -0.1210386  0.25474680
## wt    0.002897706  0.3392809 -0.1598333  0.57211273
## qsec -0.226660704 -0.2986852  0.4144075 -0.19671599
## am   -0.087616182 -0.2135605  0.1897463 -0.02465169
## gear  0.393990378  0.2484622  0.2614819  0.05482771
## carb -0.398353829  0.1321064 -0.1054553 -0.31083546
# See how cases load on PCs (i.e. the PC scores for each car)
predict(pc)
##                             PC1         PC2         PC3          PC4
## Mazda RX4           -0.81883768  1.45577333 -0.21204263  0.315888300
## Mazda RX4 Wag       -0.78644303  1.26268953  0.04767210  0.119647855
## Datsun 710          -2.49423117  0.02762658 -0.32023017 -0.401948370
## Hornet 4 Drive      -0.29454234 -1.92903945 -0.32211475 -0.069818183
## Hornet Sportabout    1.56041411 -0.80821419 -1.04219408  0.050065675
## Valiant             -0.20722532 -2.19417266  0.14402455 -0.073226863
## Duster 360           2.73226603  0.29328994 -0.57716172  0.525124977
## Merc 240D           -1.79527743 -1.27281225  1.03388048  0.136366170
## Merc 230            -1.89734058 -1.92598643  1.95890184 -0.259206293
## Merc 280             0.01565012 -0.05866208  1.06454809  0.737712361
## Merc 280C            0.03629307 -0.22610850  1.28872352  0.683986341
## Merc 450SE           1.82083345 -0.68439747 -0.18980574  0.295092091
## Merc 450SL           1.60267678 -0.67977004 -0.27149159  0.401507010
## Merc 450SLC          1.71399687 -0.80382315 -0.07136381  0.369296647
## Cadillac Fleetwood   3.54393557 -0.78715158  0.61681226 -0.844299902
## Lincoln Continental  3.64660694 -0.72728678  0.64331413 -0.870281313
## Chrysler Imperial    3.39264826 -0.52198151  0.39635946 -0.820419326
## Fiat 128            -3.52803830 -0.23945546 -0.32703554 -0.516783758
## Honda Civic         -3.44178368  0.32746057 -0.42306580  0.167700576
## Toyota Corolla      -3.85421097 -0.29067456 -0.35299640 -0.412244409
## Toyota Corona       -1.64164478 -1.97896631  0.10056967  0.621710410
## Dodge Challenger     1.55167305 -0.86712498 -0.90521454  0.326318496
## AMC Javelin          1.44035057 -0.96337487 -0.77406360  0.368187375
## Camaro Z28           2.92480902  0.36716333 -0.57304474  0.526775004
## Pontiac Firebird     1.81339410 -0.90145453 -0.96469148 -0.314790674
## Fiat X1-9           -3.22172493 -0.06085364 -0.44753150 -0.200178011
## Porsche 914-2       -2.66209565  1.53159161 -0.27507492 -0.212645194
## Lotus Europa        -3.19041442  1.69409211 -0.52346685  0.008155493
## Ford Pantera L       1.59533098  3.09923346 -0.61246644 -0.694517979
## Ferrari Dino        -0.24630742  3.18027405  0.72936287  0.507145572
## Maserati Bora        2.62596044  4.40241877  0.97303537 -0.006628448
## Volvo 142E          -1.93672169  0.27969720  0.18785195 -0.463691632
##                             PC5         PC6           PC7          PC8
## Mazda RX4           -0.84958691 -0.01150126  0.2522589117  0.070182163
## Mazda RX4 Wag       -0.88755160 -0.08177799  0.2470770970  0.158396118
## Datsun 710          -0.36518038  0.53888511 -0.5002113448 -0.034749407
## Hornet 4 Drive       0.20547103 -0.04600804 -0.0108490874  0.008913676
## Hornet Sportabout    0.38197028 -0.13573066  0.1519111982  0.077208430
## Valiant             -0.08498911  0.26511187 -0.2594831624  0.275929962
## Duster 360           0.19900274 -0.21386156 -0.3957363363 -0.363215758
## Merc 240D            0.39973745  0.22142233  0.5428418769 -0.326885871
## Merc 230             0.60577005 -0.07860918 -0.3862488169  0.339834557
## Merc 280             0.13700873  0.10015509  0.4329982539 -0.004088557
## Merc 280C            0.08183421  0.19097540  0.2483130414  0.169616855
## Merc 450SE          -0.13790858 -0.17982680  0.0644632164  0.136576953
## Merc 450SL          -0.01105796 -0.31351178 -0.0326072369  0.216281162
## Merc 450SLC         -0.11991960 -0.11371190 -0.2087231635  0.352717370
## Cadillac Fleetwood  -0.35483328  0.14208110  0.1686960199 -0.096175655
## Lincoln Continental -0.35666482  0.12483822  0.1380129217 -0.166318506
## Chrysler Imperial   -0.06847485 -0.39460143  0.2568141030 -0.357074479
## Fiat 128            -0.02567396 -0.61745094  0.1111807743  0.026812599
## Honda Civic         -0.28378711 -0.45517710  0.1611472382 -0.085881908
## Toyota Corolla       0.12577796 -0.84883188  0.0006918617  0.159151765
## Toyota Corona        0.04761048  0.14446951 -0.6908183162 -0.456547134
## Dodge Challenger    -0.03467077  0.35437059  0.1896203162  0.198121231
## AMC Javelin         -0.04322194  0.33421087  0.0475150953  0.334345388
## Camaro Z28           0.05762007 -0.04009785 -0.3067172410 -0.471489439
## Pontiac Firebird     0.39111452 -0.19470872  0.3822511126 -0.037799957
## Fiat X1-9           -0.25319420  0.06217622 -0.1923901293  0.063485192
## Porsche 914-2        0.31823141  0.69486607  0.4076663225 -0.248004690
## Lotus Europa         0.78245261  0.05939704  0.1649363550 -0.219274124
## Ford Pantera L       0.68539841  0.59731381 -0.1758978468  0.419647806
## Ferrari Dino        -0.23921602  0.06422736  0.2232814502 -0.019483666
## Maserati Bora        0.27345257 -0.57263382 -0.5540820690  0.065079511
## Volvo 142E          -0.57652141  0.40354034 -0.4779124155 -0.185311588
##                              PC9
## Mazda RX4           -0.181855680
## Mazda RX4 Wag       -0.094402631
## Datsun 710           0.107758386
## Hornet 4 Drive      -0.123239037
## Hornet Sportabout   -0.150672423
## Valiant              0.017282559
## Duster 360          -0.168608127
## Merc 240D            0.034834992
## Merc 230            -0.189739133
## Merc 280             0.111701775
## Merc 280C            0.014135709
## Merc 450SE           0.399280466
## Merc 450SL           0.198722029
## Merc 450SLC          0.136650969
## Cadillac Fleetwood  -0.255615988
## Lincoln Continental -0.035108813
## Chrysler Imperial    0.221926989
## Fiat 128             0.214967700
## Honda Civic         -0.295988897
## Toyota Corolla       0.024795528
## Toyota Corona       -0.065421142
## Dodge Challenger    -0.028308674
## AMC Javelin         -0.057433273
## Camaro Z28           0.067421709
## Pontiac Firebird    -0.119242525
## Fiat X1-9            0.006363979
## Porsche 914-2        0.093645749
## Lotus Europa         0.020202599
## Ford Pantera L      -0.003396395
## Ferrari Dino        -0.006799694
## Maserati Bora       -0.037841219
## Volvo 142E           0.143982515
# Biplot: cases (points) and variable loadings (arrows) on PC1 x PC2
biplot(pc)

# =========== Factor Analysis
# factanal() fits by maximum likelihood; varimax rotation by default.
# Gives chi square test that number of factors
# is sufficient to match data (want p > .05).
# Also gives uniqueness values for variables,
# variable loadings on factors, and variance
# statistics.
factanal(mtcars1, 1)  # Start with a single factor
## 
## Call:
## factanal(x = mtcars1, factors = 1)
## 
## Uniquenesses:
##   mpg   cyl  disp    hp    wt  qsec    am  gear  carb 
## 0.162 0.121 0.086 0.308 0.198 0.775 0.642 0.723 0.737 
## 
## Loadings:
##      Factor1
## mpg  -0.915 
## cyl   0.937 
## disp  0.956 
## hp    0.832 
## wt    0.896 
## qsec -0.475 
## am   -0.599 
## gear -0.527 
## carb  0.513 
## 
##                Factor1
## SS loadings      5.249
## Proportion Var   0.583
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 146.3 on 27 degrees of freedom.
## The p-value is 2.41e-18
factanal(mtcars1, 2)
## 
## Call:
## factanal(x = mtcars1, factors = 2)
## 
## Uniquenesses:
##   mpg   cyl  disp    hp    wt  qsec    am  gear  carb 
## 0.156 0.106 0.090 0.083 0.158 0.261 0.202 0.195 0.302 
## 
## Loadings:
##      Factor1 Factor2
## mpg  -0.700  -0.595 
## cyl   0.658   0.679 
## disp  0.759   0.577 
## hp    0.355   0.889 
## wt    0.815   0.422 
## qsec  0.105  -0.853 
## am   -0.888         
## gear -0.875   0.198 
## carb          0.835 
## 
##                Factor1 Factor2
## SS loadings      3.857   3.590
## Proportion Var   0.429   0.399
## Cumulative Var   0.429   0.827
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 56.6 on 19 degrees of freedom.
## The p-value is 1.32e-05
factanal(mtcars1, 3)
## 
## Call:
## factanal(x = mtcars1, factors = 3)
## 
## Uniquenesses:
##   mpg   cyl  disp    hp    wt  qsec    am  gear  carb 
## 0.152 0.049 0.080 0.118 0.005 0.125 0.244 0.099 0.196 
## 
## Loadings:
##      Factor1 Factor2 Factor3
## mpg   0.610  -0.538  -0.431 
## cyl  -0.614   0.733   0.192 
## disp -0.703   0.558   0.339 
## hp   -0.263   0.820   0.375 
## wt   -0.738   0.292   0.604 
## qsec -0.149  -0.923         
## am    0.857          -0.132 
## gear  0.936           0.139 
## carb  0.153   0.678   0.567 
## 
##                Factor1 Factor2 Factor3
## SS loadings      3.513   3.218   1.202
## Proportion Var   0.390   0.358   0.134
## Cumulative Var   0.390   0.748   0.881
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 24.56 on 12 degrees of freedom.
## The p-value is 0.017
factanal(mtcars1, 4)  # First w/p > .05
## 
## Call:
## factanal(x = mtcars1, factors = 4)
## 
## Uniquenesses:
##   mpg   cyl  disp    hp    wt  qsec    am  gear  carb 
## 0.137 0.045 0.005 0.108 0.038 0.101 0.189 0.126 0.031 
## 
## Loadings:
##      Factor1 Factor2 Factor3 Factor4
## mpg   0.636  -0.445  -0.453  -0.234 
## cyl  -0.601   0.701   0.277   0.163 
## disp -0.637   0.555   0.176   0.500 
## hp   -0.249   0.721   0.472   0.296 
## wt   -0.730   0.219   0.417   0.456 
## qsec -0.182  -0.897  -0.246         
## am    0.891                  -0.100 
## gear  0.907           0.226         
## carb          0.478   0.851         
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      3.424   2.603   1.549   0.644
## Proportion Var   0.380   0.289   0.172   0.072
## Cumulative Var   0.380   0.670   0.842   0.913
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 6.06 on 6 degrees of freedom.
## The p-value is 0.416

Somewhat related:

(DETECT OUTLIERS)

# DM_05_03.R

# INSTALL AND LOAD PACKAGES ################################

# ggplot2: plots; grid/gridExtra: arranging plots; robustbase: covMcd()
pacman::p_load(ggplot2, grid, gridExtra, robustbase) 

# DATA #####################################################

# Import the data (use `<-` for assignment, consistent with the rest
# of the file). NOTE(review): the name `data` shadows base::data().
data <- read.csv(here::here("AnomalyData.csv"))

# Structure of the data
str(data)
## 'data.frame':    48 obs. of  30 variables:
##  $ State            : Factor w/ 48 levels "Alabama","Arizona",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ state_code       : Factor w/ 48 levels "AL","AR","AZ",..: 1 3 2 4 5 6 7 8 9 11 ...
##  $ data.science     : num  -1 -0.42 -0.66 1.95 0.34 0.69 0.45 -0.73 -0.27 -0.67 ...
##  $ cluster.analysis : num  -0.13 -0.73 -0.39 -0.62 0 1.28 2.91 -1.38 -0.57 -1.12 ...
##  $ college          : num  1.1 -0.1 -0.64 -0.26 -0.61 1.17 -0.46 -0.3 0.25 -0.87 ...
##  $ startup          : num  -0.68 0.11 -0.08 2.02 1.49 0.41 0.14 -0.75 -1.13 1.3 ...
##  $ entrepreneur     : num  0.15 0.57 0.01 0.46 0.05 0.09 2.74 0.78 1.97 0.21 ...
##  $ ceo              : num  -0.73 0.25 -0.66 1.27 0.33 1.52 0.91 0.36 0.42 -0.59 ...
##  $ mortgage         : num  1.53 0.95 -0.5 -0.97 1.38 0.51 1.71 0.03 0.25 0.62 ...
##  $ nba              : num  -0.74 0.38 -0.71 1.46 -0.8 0.03 0.37 1.43 0.89 -1.47 ...
##  $ nfl              : num  -1.83 0.68 -1.59 -0.91 1.17 -0.64 1.63 -0.47 -0.53 -0.3 ...
##  $ mlb              : num  -1.3 0.14 -1.24 0.39 -0.51 1.25 0.86 0.06 -0.67 -1.35 ...
##  $ fifa             : num  -0.8 0.1 -0.88 1.94 -0.33 1.58 0.83 1.07 0.09 -0.9 ...
##  $ modern.dance     : num  -1.22 0.3 -1.36 0.33 -0.34 0.59 -0.16 -0.51 -0.81 0.21 ...
##  $ prius            : num  -0.74 1.19 -0.43 3.97 0.24 0.11 0.12 -0.02 -0.49 -0.03 ...
##  $ escalade         : num  0.1 0.78 0.93 -0.28 0.01 -1.14 -0.61 0.82 1.14 -0.6 ...
##  $ subaru           : num  -1.19 -0.7 -0.79 -0.52 1.37 1.17 -0.1 -1.01 -1.04 1.07 ...
##  $ jello            : num  -0.75 -0.58 -0.19 -1.21 -0.55 -0.78 -0.56 -1.07 -1.03 1.23 ...
##  $ bbq              : num  1.52 0.36 1.01 1.37 0.63 -1.05 -1.05 -0.18 1.22 0.19 ...
##  $ royal.family     : num  0.26 -1.06 -0.09 -1.04 -0.8 1.11 0.94 -0.84 -1.26 -0.83 ...
##  $ obfuscation      : num  -0.32 0.38 -0.45 0.7 1.32 0.47 0.11 -0.88 -0.71 1.05 ...
##  $ unicorn          : num  -1.03 0.1 -0.32 -0.38 0.2 0.09 -0.49 -1.1 -1.01 1.8 ...
##  $ Extraversion     : num  55.5 50.6 49.9 51.4 45.3 57.6 47 60.9 63.2 40.7 ...
##  $ Agreeableness    : num  52.7 46.6 52.7 49 47.5 38.6 38.8 50.7 60 52.9 ...
##  $ Conscientiousness: num  55.5 58.4 41 43.2 58.8 34.2 36.5 62.7 68.8 44.5 ...
##  $ Neuroticism      : num  48.7 38.1 56.2 39.1 34.3 53.4 62.4 40.8 38 44.2 ...
##  $ Openness         : num  42.7 54.7 40.3 65 57.9 53.9 42.7 61 56.9 44.7 ...
##  $ PsychRegions     : int  1 2 1 2 1 3 3 1 1 2 ...
##  $ region           : int  3 4 3 4 4 1 3 3 3 4 ...
##  $ division         : int  6 8 7 9 8 1 5 5 5 8 ...
# Transform the grouping variables to factors
data$PsychRegions <- as.factor(data$PsychRegions)
data$region <- as.factor(data$region)
data$division <- as.factor(data$division)

# UNIVARIATE OUTLIERS ######################################

# The original code repeated the same qplot() chunk ten times, once per
# variable; factored into one helper applied over a vector of names.

# Boxplot of a single column; cases flagged as outliers by
# boxplot.stats() are labelled with their state code.
#   df  : data frame containing `var` and `state_code`
#   var : column name (string) to plot
plot_box_outliers <- function(df, var) {
  ggplot(df, aes(x = 1, y = .data[[var]])) +
    geom_boxplot(outlier.colour = "#E38942") +
    xlim(0, 2) +
    labs(x = NULL, y = NULL, title = var) +
    geom_text(aes(label = ifelse(.data[[var]] %in%
                                   boxplot.stats(df[[var]])$out,
                                 as.character(state_code), "")),
              hjust = 1.5)
}

# Variables to screen for univariate outliers
uni_vars <- c("data.science", "cluster.analysis", "college", "startup",
              "entrepreneur", "ceo", "mortgage", "nba", "royal.family",
              "Neuroticism")

# Build one boxplot per variable and show each
uni_plots <- lapply(uni_vars, function(v) plot_box_outliers(data, v))
invisible(lapply(uni_plots, print))

# Plot all 10 together
grid.arrange(grobs = uni_plots,
             nrow = 2,
             top = "Boxplots: Univariate outliers")

# BIVARIATE OUTLIERS #######################################

# The original code repeated the same qplot() chunk eight times;
# factored into one helper.

# Scatterplot of two columns with a 99% data ellipse; cases for which
# `flag` is TRUE are labelled with their state code.
#   df         : data frame containing the columns and `state_code`
#   xvar, yvar : column names (strings)
#   flag       : optional logical vector marking cases to label
plot_biv_outliers <- function(df, xvar, yvar, flag = NULL) {
  p <- ggplot(df, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_point() +
    labs(x = xvar, y = yvar, title = paste(xvar, "vs", yvar)) +
    stat_ellipse(level = .99, color = "#E38942")
  if (!is.null(flag)) {
    p <- p +
      geom_text(aes(label = ifelse(flag, as.character(state_code), "")),
                hjust = 1.5)
  }
  p
}

# data.science vs cluster.analysis
b1 <- plot_biv_outliers(data, "data.science", "cluster.analysis",
        with(data, data.science > 1.8 | cluster.analysis > 1.6))
b1 

# mortgage vs ceo
b2 <- plot_biv_outliers(data, "mortgage", "ceo",
        with(data, ceo > 2))
b2

# modern.dance vs Openness
b3 <- plot_biv_outliers(data, "modern.dance", "Openness",
        with(data, modern.dance > 2 | Openness < 30))
b3

# fifa vs nba
b4 <- plot_biv_outliers(data, "fifa", "nba",
        with(data, fifa > 2))
b4

# subaru vs escalade
b5 <- plot_biv_outliers(data, "subaru", "escalade",
        with(data, subaru > 2.5))
b5

# unicorn vs obfuscation
b6 <- plot_biv_outliers(data, "unicorn", "obfuscation",
        with(data, unicorn > 2 | obfuscation > 2))
b6

# Conscientiousness vs Extraversion (no cases labelled)
b7 <- plot_biv_outliers(data, "Conscientiousness", "Extraversion")
b7

# college vs royal.family (no cases labelled)
b8 <- plot_biv_outliers(data, "college", "royal.family")
b8

# Plot all 8 together
grid.arrange(b1, b2, b3, b4, b5, b6, b7, b8,
  nrow = 2, top = "Bivariate outliers")

# MULTIVARIATE OUTLIERS ####################################

# Measure the overall distance of each case from the others using both
# Mahalanobis distance and a robust (MCD-based) measure of distance.

# Use only the quantitative variables: drop State, state_code,
# PsychRegions, region, division (columns 1, 2, 28, 29, 30)
mcd <- covMcd(data[-c(1, 2, 28, 29, 30)])
## Warning in covMcd(data[-c(1, 2, 28, 29, 30)]): n < 2 * p, i.e., possibly
## too small sample size
par(mfrow = c(1, 2))  # two plots side by side
# Mahalanobis vs. robust distance
plot(mcd, 
     which = "dd", 
     labels.id = as.character(data$state_code))
# QQ plot for robust distance
plot(mcd,
     which = "qqchi2",
     labels.id = as.character(data$state_code))
par(mfrow = c(1, 1))  # restore single-plot layout for later plots

(ASSOCIATION ANALYSIS)

# DM_06_03.R

# INSTALL AND LOAD PACKAGES ################################

# arules: frequent itemsets / association rules; arulesViz: rule plots
pacman::p_load(arules, arulesViz) 

# DATA #####################################################

## Read transactional data shipped with the arules package
data("Groceries")   # Load data
#?Groceries          # Help on data
str(Groceries)      # Structure of data (sparse transactions matrix)
## Formal class 'transactions' [package "arules"] with 3 slots
##   ..@ data       :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
##   .. .. ..@ i       : int [1:43367] 13 60 69 78 14 29 98 24 15 29 ...
##   .. .. ..@ p       : int [1:9836] 0 4 7 8 12 16 21 22 27 28 ...
##   .. .. ..@ Dim     : int [1:2] 169 9835
##   .. .. ..@ Dimnames:List of 2
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. ..@ factors : list()
##   ..@ itemInfo   :'data.frame':  169 obs. of  3 variables:
##   .. ..$ labels: chr [1:169] "frankfurter" "sausage" "liver loaf" "ham" ...
##   .. ..$ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
##   .. ..$ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
##   ..@ itemsetInfo:'data.frame':  0 obs. of  0 variables
summary(Groceries)  # Includes 5 most frequent items
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##        labels  level2           level1
## 1 frankfurter sausage meat and sausage
## 2     sausage sausage meat and sausage
## 3  liver loaf sausage meat and sausage
# RULES ####################################################

# Set minimum support (minSup) to .001
# Set minimum confidence (minConf) to .75
# With these thresholds apriori finds 777 rules (see output below)

rules <- apriori(Groceries, 
           parameter = list(supp = 0.001, conf = 0.75))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.75    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [777 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
options(digits=2)  # NOTE(review): changes a global session option and is never restored
inspect(rules[1:10])  # Show the first 10 rules with support/confidence/lift
##      lhs                         rhs                support confidence lift count
## [1]  {liquor,                                                                    
##       red/blush wine}         => {bottled beer}      0.0019       0.90 11.2    19
## [2]  {curd,                                                                      
##       cereals}                => {whole milk}        0.0010       0.91  3.6    10
## [3]  {root vegetables,                                                           
##       cereals}                => {whole milk}        0.0010       0.77  3.0    10
## [4]  {yogurt,                                                                    
##       cereals}                => {whole milk}        0.0017       0.81  3.2    17
## [5]  {butter,                                                                    
##       jam}                    => {whole milk}        0.0010       0.83  3.3    10
## [6]  {whipped/sour cream,                                                        
##       instant coffee}         => {other vegetables}  0.0010       0.77  4.0    10
## [7]  {soups,                                                                     
##       bottled beer}           => {whole milk}        0.0011       0.92  3.6    11
## [8]  {yogurt,                                                                    
##       Instant food products}  => {whole milk}        0.0011       0.79  3.1    11
## [9]  {napkins,                                                                   
##       house keeping products} => {whole milk}        0.0013       0.81  3.2    13
## [10] {whipped/sour cream,                                                        
##       house keeping products} => {whole milk}        0.0012       0.92  3.6    12
# PLOTS ####################################################

# Scatterplot of support x confidence (colored by lift)
# (default plot method for a rules object)
plot(rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Graph of top 20 rules
# NOTE(review): the original call passed control = list(type = "items"),
# which current arulesViz does not recognize (it emitted the warning
# "Unknown control parameters: type"); the plain call produces the same
# item-based graph without the warning.
plot(rules[1:20],
  method = "graph")

# Parallel coordinates plot of top 20 rules
# (each line runs from LHS items to the RHS consequent)
plot(rules[1:20], 
  method = "paracoord", 
  control = list(reorder = TRUE))

# Matrix plot of antecedents (rows) and consequents (columns)
plot(rules[1:20], 
     method = "matrix", 
      control = list(reorder = "none")# ,
      # measure=c("lift", "confidence")
      )
## Itemsets in Antecedent (LHS)
##  [1] "{liquor,red/blush wine}"                    
##  [2] "{curd,cereals}"                             
##  [3] "{root vegetables,cereals}"                  
##  [4] "{yogurt,cereals}"                           
##  [5] "{butter,jam}"                               
##  [6] "{whipped/sour cream,instant coffee}"        
##  [7] "{soups,bottled beer}"                       
##  [8] "{yogurt,Instant food products}"             
##  [9] "{napkins,house keeping products}"           
## [10] "{whipped/sour cream,house keeping products}"
## [11] "{root vegetables,house keeping products}"   
## [12] "{pastry,sweet spreads}"                     
## [13] "{turkey,curd}"                              
## [14] "{turkey,whipped/sour cream}"                
## [15] "{turkey,bottled water}"                     
## [16] "{turkey,root vegetables}"                   
## [17] "{rice,sugar}"                               
## [18] "{frozen vegetables,rice}"                   
## [19] "{butter,rice}"                              
## [20] "{domestic eggs,rice}"                       
## Itemsets in Consequent (RHS)
## [1] "{bottled beer}"     "{whole milk}"       "{other vegetables}"

# Grouped matrix plot of antecedents and consequents
# (rules with similar LHS itemsets are aggregated into groups)
plot(rules[1:20], method = "grouped")

(SEQUENTIAL PATTERNS (sub-type of association))

Sequence mining -> look for chains in the data (if this happens, that is likely to happen next)

  • temporal association
  • ordinal association

EXAMPLES: - genetic sequencing - recommendation engines - marketing offers - behavioral state switching

ALGORITHMS: + Generalized Sequential Patterns + Hidden Markov Model + SPADE, Sequential Pattern Discovery using Equivalence classes.

EXAMPLE: Hidden Markov model

Hidden Markov models are looking for switches in state conditions, or you might want to say qualitatively distinct patterns of behavior

# # CLEAN UP #################################################
# 
# # Clear workspace
# rm(list = ls()) 
# 
# # Clear packages
# pacman::p_unload(ggplot2, grid, gridExtra, robustbase)
# 
# # Clear plots
# dev.off()  # But only if there IS a plot
# 
# # Clear console
# cat("\014")  # ctrl+L
#